library(tidyverse)
library(NbClust)
library(factoextra)
library(cluster)
library(GGally)
The dataset traveler_reviews.csv was gathered from destination reviews published by 249 reviewers of holidayiq.com till October 2014. Reviews falling in 6 categories among destinations across South India were considered and the count of reviews in each category for every reviewer (traveler) is captured.
The HolidayIQ Club
Attribute Information:
- User Id: Unique user id
- Sports: Number of reviews on stadiums, sports complex, etc.
- Religious: Number of reviews on religious institutions
- Nature: Number of reviews on beach, lake, river, etc.
- Theatre: Number of reviews on theatres, exhibitions, etc.
- Shopping: Number of reviews on malls, shopping places, etc.
- Picnic: Number of reviews on parks, picnic spots, etc.
Mysore Palace in South India
Alleppey Backwaters in South India
d <- read_csv('traveler_reviews.csv')
glimpse(d)
## Observations: 249
## Variables: 7
## $ `User Id` <chr> "User 1", "User 2", "User 3", "User 4", "User 5", "U...
## $ Sports <dbl> 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 5, 5...
## $ Religious <dbl> 77, 62, 50, 68, 98, 52, 64, 54, 64, 86, 107, 103, 64...
## $ Nature <dbl> 79, 76, 97, 77, 54, 109, 85, 107, 108, 76, 54, 60, 8...
## $ Theatre <dbl> 69, 76, 87, 95, 59, 93, 82, 92, 64, 74, 64, 63, 82, ...
## $ Shopping <dbl> 68, 69, 50, 76, 95, 52, 73, 54, 54, 74, 103, 102, 75...
## $ Picnic <dbl> 95, 68, 75, 61, 86, 76, 69, 76, 93, 103, 94, 93, 69,...
summary(d)
## User Id Sports Religious Nature
## Length:249 Min. : 2.00 Min. : 50.0 Min. : 52.0
## Class :character 1st Qu.: 6.00 1st Qu.: 84.0 1st Qu.: 89.0
## Mode :character Median :12.00 Median :104.0 Median :119.0
## Mean :11.99 Mean :109.8 Mean :124.5
## 3rd Qu.:18.00 3rd Qu.:132.0 3rd Qu.:153.0
## Max. :25.00 Max. :203.0 Max. :318.0
## Theatre Shopping Picnic
## Min. : 59.0 Min. : 50.0 Min. : 61.0
## 1st Qu.: 93.0 1st Qu.: 79.0 1st Qu.: 92.0
## Median :113.0 Median :104.0 Median :119.0
## Mean :116.4 Mean :112.6 Mean :120.4
## 3rd Qu.:138.0 3rd Qu.:138.0 3rd Qu.:143.0
## Max. :213.0 Max. :233.0 Max. :218.0
d_scaled <- d %>% select(-`User Id`) %>% scale()
head(d_scaled)
## Sports Religious Nature Theatre Shopping Picnic
## [1,] -1.509552 -1.0100142 -0.9973422 -1.4744331 -1.0740003 -0.7783943
## [2,] -1.509552 -1.4722052 -1.0630749 -1.2565864 -1.0499404 -1.6057691
## [3,] -1.509552 -1.8419580 -0.6029459 -0.9142560 -1.5070790 -1.3912645
## [4,] -1.509552 -1.2873288 -1.0411640 -0.6652884 -0.8815209 -1.8202736
## [5,] -1.509552 -0.3629468 -1.5451149 -1.7856426 -0.4243823 -1.0541859
## [6,] -1.358415 -1.7803325 -0.3400150 -0.7275303 -1.4589591 -1.3606210
d_scaled_tall <- d_scaled %>% as_tibble() %>% gather(key = "variable")
ggplot(data = d_scaled_tall, aes(x = variable, y = value)) +
geom_boxplot()
fviz_nbclust(d_scaled, kmeans, method = "wss") +
geom_vline(xintercept = 4, linetype = 2) +
labs(subtitle = "Elbow method")
fviz_nbclust(d_scaled, kmeans, method = "silhouette") +
labs(subtitle = "Silhouette method")
set.seed(123)
fviz_nbclust(d_scaled, kmeans, method = "gap_stat", nboot = 50)+
labs(subtitle = "Gap statistic method")
set.seed(123)
km_clusters <- kmeans(d_scaled, centers = 3, nstart = 25)
fviz_cluster(km_clusters, data = d_scaled)
km_clusters$size
## [1] 39 113 97
km_clusters$centers %>% t() %>% round(3)
## 1 2 3
## Sports 0.595 -0.917 0.829
## Religious 1.608 -0.659 0.121
## Nature -0.832 -0.509 0.927
## Theatre -0.183 -0.569 0.736
## Shopping 1.668 -0.616 0.047
## Picnic 0.562 -0.790 0.694
centroids_tall <- km_clusters$centers %>%
as_tibble() %>%
mutate(cluster = rownames(km_clusters$centers)) %>%
gather(key = var, value = value, Sports:Picnic)
ggplot(data = centroids_tall,
aes(x = cluster, y = value, fill = cluster)) +
geom_bar(stat = "identity") +
facet_wrap(~ var, ncol = 3)
sil <- silhouette(km_clusters$cluster, dist(d_scaled))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 39 0.38
## 2 2 113 0.47
## 3 3 97 0.22
d_mutated <- d %>%
mutate(Total_reviews = Sports + Religious + Nature + Theatre + Shopping + Picnic,
Sports = Sports/Total_reviews,
Religious = Religious/Total_reviews,
Nature = Nature/Total_reviews,
Theatre = Theatre/Total_reviews,
Shopping = Shopping/Total_reviews,
Picnic = Picnic/Total_reviews)
rownames(d_mutated) <- d_mutated$`User Id`
d_mutated <- d_mutated %>% select(-`User Id`) %>% scale()
head(d_mutated)
## Sports Religious Nature Theatre Shopping Picnic
## User 1 -1.880073 0.3522515 -0.1201255 -0.4723631 -0.2730941 1.4201734
## User 2 -1.805406 -0.2400670 0.0874366 0.4113481 0.1551331 -0.3063525
## User 3 -1.822847 -1.2490676 0.9579022 1.0031802 -1.0004809 0.2060185
## User 4 -1.859398 -0.1372973 -0.1103104 1.2256926 0.2577910 -1.3799321
## User 5 -1.887305 1.7460634 -1.1879540 -1.0982110 1.0812211 0.5623982
## User 6 -1.510017 -1.3425196 1.1929387 1.0161042 -1.0702523 -0.1447961
## Total_reviews
## User 1 -1.604465
## User 2 -1.893061
## User 3 -1.830662
## User 4 -1.690264
## User 5 -1.573265
## User 6 -1.643464
cor(d_mutated) %>% round(3)
## Sports Religious Nature Theatre Shopping Picnic
## Sports 1.000 -0.183 0.097 -0.128 -0.035 0.029
## Religious -0.183 1.000 -0.899 -0.427 0.774 0.002
## Nature 0.097 -0.899 1.000 0.178 -0.854 0.183
## Theatre -0.128 -0.427 0.178 1.000 -0.389 -0.628
## Shopping -0.035 0.774 -0.854 -0.389 1.000 -0.279
## Picnic 0.029 0.002 0.183 -0.628 -0.279 1.000
## Total_reviews 0.947 -0.024 -0.069 -0.223 0.120 0.070
## Total_reviews
## Sports 0.947
## Religious -0.024
## Nature -0.069
## Theatre -0.223
## Shopping 0.120
## Picnic 0.070
## Total_reviews 1.000
d_mutated <- as_tibble(d_mutated) %>% select(-Shopping)
cl_nb <- NbClust(data = d_mutated, distance = "euclidean",
min.nc = 2, max.nc = 9,
method = "kmeans", index = "all")
## *** : The Hubert index is a graphical method of determining the number of clusters.
## In the plot of Hubert index, we seek a significant knee that corresponds to a
## significant increase of the value of the measure i.e the significant peak in Hubert
## index second differences plot.
##
## *** : The D index is a graphical method of determining the number of clusters.
## In the plot of D index, we seek a significant knee (the significant peak in Dindex
## second differences plot) that corresponds to a significant increase of the value of
## the measure.
##
## *******************************************************************
## * Among all indices:
## * 4 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 4 proposed 8 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
##
## ***** Conclusion *****
##
## * According to the majority rule, the best number of clusters is 2
##
##
## *******************************************************************
fviz_nbclust(cl_nb)
## Among all indices:
## ===================
## * 2 proposed 0 as the best number of clusters
## * 1 proposed 1 as the best number of clusters
## * 4 proposed 2 as the best number of clusters
## * 4 proposed 3 as the best number of clusters
## * 3 proposed 4 as the best number of clusters
## * 1 proposed 5 as the best number of clusters
## * 4 proposed 6 as the best number of clusters
## * 1 proposed 7 as the best number of clusters
## * 4 proposed 8 as the best number of clusters
## * 2 proposed 9 as the best number of clusters
##
## Conclusion
## =========================
## * According to the majority rule, the best number of clusters is 2 .
set.seed(127)
km_clusters2 <- kmeans(d_mutated, centers = 3, nstart = 25)
fviz_cluster(km_clusters2, data = d_mutated)
km_clusters2$size
## [1] 86 78 85
km_clusters2$centers %>% t() %>% round(3)
## 1 2 3
## Sports -0.762 1.028 -0.172
## Religious -0.434 -0.696 1.077
## Nature 0.394 0.668 -1.011
## Theatre 0.686 -0.050 -0.649
## Picnic -0.464 0.210 0.277
## Total_reviews -0.854 0.906 0.033
centroids_tall2 <- km_clusters2$centers %>%
as_tibble() %>%
mutate(cluster = rownames(km_clusters2$centers)) %>%
gather(key = var, value = value, Sports:Total_reviews)
ggplot(data = centroids_tall2,
aes(x = cluster, y = value, fill = cluster)) +
geom_bar(stat = "identity") +
facet_wrap(~ var, ncol = 3)
sil <- silhouette(km_clusters2$cluster, dist(d_mutated))
fviz_silhouette(sil)
## cluster size ave.sil.width
## 1 1 86 0.30
## 2 2 78 0.27
## 3 3 85 0.23
set.seed(127)
hclust_complete_sample <- eclust(sample(as.matrix(d_mutated), 40), FUNcluster="hclust", hc_method="complete", k = 3)
fviz_dend(hclust_complete_sample, show_labels=T, main = 'Dendrogram using complete linkage')
hclust_complete <- eclust(d_mutated, FUNcluster="hclust", hc_method="complete", k = 4)
fviz_dend(hclust_complete, show_labels=F, main = 'Dendrogram using complete linkage')
fviz_cluster(hclust_complete)
fviz_silhouette(hclust_complete)
## cluster size ave.sil.width
## 1 1 77 0.17
## 2 2 56 0.19
## 3 3 52 0.31
## 4 4 64 0.26
cbind(d_mutated, cluster = hclust_complete$cluster) %>%
as_tibble() %>%
gather(key = var, value = value, Sports:Total_reviews) %>%
group_by(cluster, var) %>%
summarize(value = mean(value)) %>%
ggplot(aes(x = cluster, y = value, fill = as.character(cluster))) +
geom_bar(stat = "identity") +
facet_wrap(~ var, ncol = 3)
hclust_single <- eclust(d_mutated, FUNcluster="hclust", hc_method = "single", k=4)
fviz_dend(hclust_single, show_labels=F, main = 'Dendrogram using single linkage')
fviz_cluster(hclust_single)
fviz_silhouette(hclust_single)
## cluster size ave.sil.width
## 1 1 246 -0.18
## 2 2 1 0.00
## 3 3 1 0.00
## 4 4 1 0.00
hclust_average <- eclust(d_mutated, FUNcluster="hclust", hc_method ="average",k=4)
fviz_dend(hclust_average, show_labels=F, main = 'Dendrogram using average linkage')
fviz_cluster(hclust_average)
fviz_silhouette(hclust_average)
## cluster size ave.sil.width
## 1 1 53 0.27
## 2 2 102 0.17
## 3 3 52 0.26
## 4 4 42 0.34